The dataset contains data from three power sub-meters installed in a house near central France, each measures specific areas of the house:
The original data set comes with Date and Time columns (both as string), lets convert them into a single field called DateTime:
# Merge datasets
power_df <-
bind_rows(power_2007, power_2008, power_2009)
# Create DateTime Column
## Combine Date and Time attribute values in a new attribute column
power_df <-cbind(power_df,paste(power_df$Date,power_df$Time), stringsAsFactors=FALSE)
## Give the new attribute in the 6th column a header name
## NOTE: if you downloaded more than 5 attributes you will need to change the column number)
# str(power_df)
colnames(power_df)[11] <-"DateTime"
# Move the DateTime column to be the first column (for convenience)
power_df <- power_df[,c(ncol(power_df), 1:(ncol(power_df)-1))]
power_df
The DateTime column is still a String, in order to use it’s time components needs to be converted into a POSIXct date type.
## Convert DateTime from character to POSIXct
power_df$DateTime <- as.POSIXct(power_df$DateTime, tz="Europe/Paris", format="%Y-%m-%d %H:%M:%S")
## Add the time zone
attr(power_df$DateTime, "tzone") <- "Europe/Paris"
power_df
Now, it’s possible to decompose the DateTime field into separate components for further analysis later on.
## Create "year" attribute with lubridate
power_df$year <- year(power_df$DateTime)
power_df$quarter <- quarter(power_df$DateTime)
power_df$month <- month(power_df$DateTime)
power_df$week <- week(power_df$DateTime)
power_df$weekday <- wday(power_df$DateTime, week_start=1)
power_df$day <- day(power_df$DateTime)
power_df$hour <- hour(power_df$DateTime)
power_df$minute <- minute(power_df$DateTime)
power_df
Let’s try to visualize a single week:
## Subset the second week of 2008 - All Observations
houseWeek <- filter(power_df, year == 2008 & week == 2)
## Plot subset houseWeek
plot(houseWeek$Sub_metering_1)
There are too many data points displayed, therefore is difficult to infer anything from it, let’s zoom in into a single day:
# Visualize a Single Day with Plotly
## Subset the 9th day of January 2008 - All observations
houseDay <- filter(power_df, year == 2008 & month == 1 & day == 9)
## Plot sub-meter 1
plot_ly(houseDay, x = ~houseDay$Time, y = ~houseDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
layout(title = "Power Consumption January 9Th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
The visualization is better, so let’s add the three sub-meters to the graph:
## Plot sub-meter 1, 2 and 3 with title, legend and labels - All observations
plot_ly(houseDay, x = ~houseDay$DateTime, y = ~houseDay$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9Th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
We can reduce the granularity from one observation per minute to one observation every 10 minutes to get a better plot, this will be called a Day visualization:
## Subset the 9th day of January 2008 - 10 Minute frequency
houseDay10 <- filter(power_df, year == 2008 & month == 1 & day == 9 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10, x = ~houseDay10$DateTime, y = ~houseDay10$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9Th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Now let’s look at another time period: the 3rd week of April 2009 - 10 Minute frequency
## Subset the 3rd week of April 2009 - 10 Minute frequency
houseWeek16 <- power_df %>%
filter(year == 2009 & month == 4 & week == 16 & (minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption April 16Th-22Nd, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Too many samples, let’s reduce them to one per hour:
## Subset the 3rd week of April 2009 - 60 Minute frequency
houseWeek16 <- power_df %>%
filter(year == 2009 & month == 4 & week == 16 & (minute == 0))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 60 Minute frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption April 16Th-22Nd, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Leason: Depending on the minute we choose, we will see or not some important data points, like the high peaks consumption instants (over 70 watt/hour) shown in the 10 min frequency graph, that are gone in the 60 minutes graph above.
Moving on, let’s see a month worth of data, with measurements every 6 hours, choosing an arbitrary minute.
## Subset April 2009 - 6 hour frequency
houseWeek16 <- power_df %>%
filter(year == 2009 & month == 4 & (hour == 0 | hour == 6 | hour == 12 | hour == 18) & (minute == 23))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 3 hour frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption for April, 2009",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
The Water Heater & AC seem to introduce a lot of noise, let’s see only the kitchen and Laundry Room
This way we can tell how many types a week the is someone cooking and/or doing laundry in a month:
## Subset the 3rd-4th weeks of April 2009 - 6 hour frequency
houseWeek16 <- power_df %>%
filter(year == 2009 & month == 4 & (hour == 0 | hour == 6 | hour == 12 | hour == 18) & (minute == 23))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 3 hour frequency
plot_ly(houseWeek16, x = ~houseWeek16$DateTime, y = ~houseWeek16$Sub_metering_1, name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseWeek16$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
layout(title = "Power Consumption April, 2009, samples every 6 hours",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
Let’s take a sample of one observation per week: on Mondays at 8:00pm for 2007, 2008 and 2009, and plot a time series for each sub-meter.
## Subset to one observation per week on Mondays at 8:00pm for 2007, 2008 and 2009
house070809weekly <- filter(power_df, weekday == 2 & hour == 20 & minute == 1)
## Create TS object with SubMeter3
tsSM3_070809weekly <- ts(house070809weekly$Sub_metering_3, frequency=52, start=c(2007,1))
## Plot sub-meter 3 with autoplot (you may need to install these packages)
library(ggplot2)
library(ggfortify)
## Plot sub-meter 3 with autoplot - add labels, color
autoplot(tsSM3_070809weekly, colour = 'green', xlab = "Time", ylab = "Watt Hours", main = "Water Heater & AC", lwd=1)
## Create TS object with SubMeter1
tsSM1_070809weekly <- ts(house070809weekly$Sub_metering_1, frequency=52, start=c(2007,1))
## Plot sub-meter 1 with autoplot - add labels, color
autoplot(tsSM1_070809weekly, colour = 'blue', xlab = "Time", ylab = "Watt Hours", main = "Kitchen", lwd=1)
## Create TS object with SubMeter1
tsSM2_070809weekly <- ts(house070809weekly$Sub_metering_2, frequency=52, start=c(2007,1))
## Plot sub-meter 1 with autoplot - add labels, color
autoplot(tsSM2_070809weekly, colour = 'orange', xlab = "Time", ylab = "Watt Hours", main = "Laundry Room", lwd=1)
Now with the available data, we’ll try to forecast the power consumption for each sub-meter, using a linear regression model first.
## Apply time series linear regression to the sub-meter 3 ts object and use summary to obtain R2 and RMSE from the model you built
library(forecast)
fitSM3 <- tslm(tsSM3_070809weekly ~ trend + season)
summary(fitSM3)
##
## Call:
## tslm(formula = tsSM3_070809weekly ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.9673 -5.6994 -0.0327 4.0327 13.3006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.200306 4.232597 3.119 0.00235 **
## trend 0.000629 0.015100 0.042 0.96685
## season2 -6.900941 6.214791 -1.110 0.26938
## season3 -13.234903 6.213892 -2.130 0.03554 *
## season4 -13.235532 6.213030 -2.130 0.03551 *
## season5 -7.236161 6.212204 -1.165 0.24675
## season6 -13.236790 6.211415 -2.131 0.03544 *
## season7 -13.237419 6.210663 -2.131 0.03541 *
## season8 -7.238048 6.209947 -1.166 0.24646
## season9 -7.238677 6.209267 -1.166 0.24637
## season10 -1.905973 6.208625 -0.307 0.75947
## season11 -7.573269 6.208019 -1.220 0.22526
## season12 -13.240564 6.207450 -2.133 0.03528 *
## season13 -7.241193 6.206917 -1.167 0.24603
## season14 -13.241822 6.206421 -2.134 0.03523 *
## season15 -6.909118 6.205962 -1.113 0.26814
## season16 -12.576414 6.205539 -2.027 0.04526 *
## season17 -6.910376 6.205153 -1.114 0.26800
## season18 -7.244339 6.204804 -1.168 0.24566
## season19 -3.911634 6.204492 -0.630 0.52978
## season20 -6.912263 6.204216 -1.114 0.26779
## season21 -6.912892 6.203977 -1.114 0.26773
## season22 -12.913521 6.203775 -2.082 0.03984 *
## season23 -1.580817 6.203610 -0.255 0.79936
## season24 -13.248113 6.203481 -2.136 0.03506 *
## season25 -1.248742 6.203390 -0.201 0.84086
## season26 -1.249371 6.203334 -0.201 0.84078
## season27 -7.250000 6.203316 -1.169 0.24518
## season28 -12.917296 6.203334 -2.082 0.03977 *
## season29 -12.917925 6.203390 -2.082 0.03976 *
## season30 -7.251887 6.203481 -1.169 0.24507
## season31 -6.919183 6.203610 -1.115 0.26727
## season32 -12.586479 6.203775 -2.029 0.04503 *
## season33 -12.587108 6.203977 -2.029 0.04503 *
## season34 -12.921070 6.204216 -2.083 0.03974 *
## season35 -6.588366 6.204492 -1.062 0.29075
## season36 -1.255661 6.204804 -0.202 0.84002
## season37 -12.922957 6.205153 -2.083 0.03974 *
## season38 1.743081 6.205539 0.281 0.77935
## season39 -12.590882 6.205962 -2.029 0.04503 *
## season40 -6.924844 6.206421 -1.116 0.26710
## season41 -7.258807 6.206917 -1.169 0.24489
## season42 -1.259436 6.207450 -0.203 0.83962
## season43 -6.926731 6.208019 -1.116 0.26709
## season44 -7.594027 6.208625 -1.223 0.22404
## season45 4.072011 6.209267 0.656 0.51340
## season46 -6.928618 6.209947 -1.116 0.26711
## season47 -6.929247 6.210663 -1.116 0.26712
## season48 4.403457 6.211415 0.709 0.47995
## season49 -12.930506 6.212204 -2.081 0.03985 *
## season50 0.735532 6.213030 0.118 0.90599
## season51 -6.598430 6.213892 -1.062 0.29075
## season52 -6.932393 6.214791 -1.115 0.26722
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 104 degrees of freedom
## Multiple R-squared: 0.3544, Adjusted R-squared: 0.03167
## F-statistic: 1.098 on 52 and 104 DF, p-value: 0.3383
Now lets create and plot a forecast for sub-meter 3.
## Create the forecast for sub-meter 3. Forecast ahead 20 time periods
forecastfitSM3 <- forecast(fitSM3, h=20)
## Plot the forecast for sub-meter 3.
autoplot(forecastfitSM3, xlab = "Year", ylab = "Watt Hours", main = "Water Heater & AC forecast")
The graph shows a few things to note: - The levels of confidence: 80% and 95% (showing the range at which there is an 85% or 95% probability that the forecasted value will fall into). - The levels of confidence shadow show negative values. This is because the confidence level shows a range of acceptance which is equidistant to an edge value greater than, and another one lower than 0. - Note that no forecasted values are below 0.
To correct this situation (of negative values), we can chance the limits of the plot, also we will update the confidence levels:
## Create sub-meter 3 forecast with confidence levels 80 and 90
forecastfitSM3c <- forecast(fitSM3, h=20, level=c(80,90))
## Plot sub-meter 3 forecast, limit y and add labels
plot(forecastfitSM3c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Water Heater & AC Forecast")
Now let’s forecast the other two sub-meters.
## Apply time series linear regression to the sub-meter 2 ts object and use summary to obtain R2 and RMSE from the model you built
fitSM2 <- tslm(tsSM2_070809weekly ~ trend + season)
summary(fitSM2)
##
## Call:
## tslm(formula = tsSM2_070809weekly ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6667 -0.6246 -0.0421 0.0421 27.2913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1861071 3.8577383 0.048 0.9616
## trend 0.0008088 0.0137625 0.059 0.9533
## season2 0.4368859 5.6643798 0.077 0.9387
## season3 -0.2305895 5.6635605 -0.041 0.9676
## season4 0.4352684 5.6627746 0.077 0.9389
## season5 0.4344596 5.6620220 0.077 0.9390
## season6 11.7669842 5.6613027 2.078 0.0401 *
## season7 0.0995087 5.6606168 0.018 0.9860
## season8 -0.2346334 5.6599643 -0.041 0.9670
## season9 -0.2354421 5.6593452 -0.042 0.9669
## season10 0.4304158 5.6587595 0.076 0.9395
## season11 12.0962737 5.6582072 2.138 0.0349 *
## season12 0.0954649 5.6576883 0.017 0.9866
## season13 -0.2386772 5.6572028 -0.042 0.9664
## season14 0.7605140 5.6567508 0.134 0.8933
## season15 0.0930386 5.6563323 0.016 0.9869
## season16 0.0922298 5.6559472 0.016 0.9870
## season17 0.4247544 5.6555955 0.075 0.9403
## season18 0.0906123 5.6552774 0.016 0.9872
## season19 1.4231368 5.6549927 0.252 0.8018
## season20 0.0889947 5.6547415 0.016 0.9875
## season21 0.4215193 5.6545238 0.075 0.9407
## season22 -0.2459561 5.6543395 -0.043 0.9654
## season23 0.7532351 5.6541888 0.133 0.8943
## season24 -0.2475737 5.6540715 -0.044 0.9652
## season25 12.4182842 5.6539878 2.196 0.0303 *
## season26 -0.2491912 5.6539376 -0.044 0.9649
## season27 -0.2500000 5.6539208 -0.044 0.9648
## season28 0.0825246 5.6539376 0.015 0.9884
## season29 0.4150491 5.6539878 0.073 0.9416
## season30 -0.2524263 5.6540715 -0.045 0.9645
## season31 0.0800983 5.6541888 0.014 0.9887
## season32 0.7459561 5.6543395 0.132 0.8953
## season33 -0.2548526 5.6545238 -0.045 0.9641
## season34 -0.2556614 5.6547415 -0.045 0.9640
## season35 0.4101965 5.6549927 0.073 0.9423
## season36 0.4093877 5.6552774 0.072 0.9424
## season37 0.4085790 5.6555955 0.072 0.9425
## season38 -0.2588965 5.6559472 -0.046 0.9636
## season39 0.7402948 5.6563323 0.131 0.8961
## season40 0.0728193 5.6567508 0.013 0.9898
## season41 0.4053439 5.6572028 0.072 0.9430
## season42 0.4045351 5.6576883 0.072 0.9431
## season43 -0.2629403 5.6582072 -0.046 0.9630
## season44 0.0695842 5.6587595 0.012 0.9902
## season45 12.4021088 5.6593452 2.191 0.0307 *
## season46 -0.2653666 5.6599643 -0.047 0.9627
## season47 1.0671579 5.6606168 0.189 0.8508
## season48 13.3996825 5.6613027 2.367 0.0198 *
## season49 0.0655404 5.6620220 0.012 0.9908
## season50 12.0647316 5.6627746 2.131 0.0355 *
## season51 -0.2694105 5.6635605 -0.048 0.9622
## season52 6.0631141 5.6643798 1.070 0.2869
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.403 on 104 degrees of freedom
## Multiple R-squared: 0.3006, Adjusted R-squared: -0.04917
## F-statistic: 0.8594 on 52 and 104 DF, p-value: 0.7245
Forecast plot for sub-meter 2:
## Create sub-meter 2 forecast with confidence levels 80 and 90
forecastfitSM2c <- forecast(fitSM2, h=20, level=c(80,90))
## Plot sub-meter 2 forecast, limit y and add labels
plot(forecastfitSM2c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Laundry Room Forecast")
## Apply time series linear regression to the sub-meter 2 ts object and use summary to obtain R2 and RMSE from the model you built
fitSM1 <- tslm(tsSM1_070809weekly ~ trend + season)
summary(fitSM1)
##
## Call:
## tslm(formula = tsSM1_070809weekly ~ trend + season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.4268 -0.2399 0.0000 0.0935 26.0935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.141984 4.986001 -0.028 0.9773
## trend 0.001797 0.017788 0.101 0.9197
## season2 13.044932 7.321026 1.782 0.0777 .
## season3 0.043134 7.319967 0.006 0.9953
## season4 12.708004 7.318951 1.736 0.0855 .
## season5 0.372873 7.317978 0.051 0.9595
## season6 0.037743 7.317049 0.005 0.9959
## season7 0.035945 7.316162 0.005 0.9961
## season8 0.034148 7.315319 0.005 0.9963
## season9 0.032351 7.314519 0.004 0.9965
## season10 0.030554 7.313762 0.004 0.9967
## season11 0.695423 7.313048 0.095 0.9244
## season12 0.360292 7.312377 0.049 0.9608
## season13 0.358495 7.311750 0.049 0.9610
## season14 7.690031 7.311166 1.052 0.2953
## season15 12.688234 7.310625 1.736 0.0856 .
## season16 0.019770 7.310127 0.003 0.9978
## season17 0.017973 7.309673 0.002 0.9980
## season18 13.016175 7.309261 1.781 0.0779 .
## season19 0.014378 7.308893 0.002 0.9984
## season20 0.012581 7.308569 0.002 0.9986
## season21 9.677450 7.308287 1.324 0.1883
## season22 0.008986 7.308049 0.001 0.9990
## season23 24.340522 7.307854 3.331 0.0012 **
## season24 13.005392 7.307703 1.780 0.0780 .
## season25 0.003595 7.307595 0.000 0.9996
## season26 0.001797 7.307530 0.000 0.9998
## season27 12.333333 7.307508 1.688 0.0945 .
## season28 -0.001797 7.307530 0.000 0.9998
## season29 -0.003595 7.307595 0.000 0.9996
## season30 -0.005392 7.307703 -0.001 0.9994
## season31 0.326144 7.307854 0.045 0.9645
## season32 -0.008986 7.308049 -0.001 0.9990
## season33 -0.010784 7.308287 -0.001 0.9988
## season34 -0.012581 7.308569 -0.002 0.9986
## season35 0.652289 7.308893 0.089 0.9291
## season36 -0.016175 7.309261 -0.002 0.9982
## season37 -0.017973 7.309673 -0.002 0.9980
## season38 0.646897 7.310127 0.088 0.9297
## season39 -0.021567 7.310625 -0.003 0.9977
## season40 0.309969 7.311166 0.042 0.9663
## season41 -0.025162 7.311750 -0.003 0.9973
## season42 -0.026959 7.312377 -0.004 0.9971
## season43 -0.028756 7.313048 -0.004 0.9969
## season44 -0.030554 7.313762 -0.004 0.9967
## season45 11.967649 7.314519 1.636 0.1048
## season46 12.632519 7.315319 1.727 0.0872 .
## season47 -0.035945 7.316162 -0.005 0.9961
## season48 0.295591 7.317049 0.040 0.9679
## season49 -0.039540 7.317978 -0.005 0.9957
## season50 0.291996 7.318951 0.040 0.9683
## season51 -0.043134 7.319967 -0.006 0.9953
## season52 -0.044932 7.321026 -0.006 0.9951
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.568 on 104 degrees of freedom
## Multiple R-squared: 0.3375, Adjusted R-squared: 0.006197
## F-statistic: 1.019 on 52 and 104 DF, p-value: 0.4587
Forecast plot for sub-meter 1:
## Create sub-meter 1 forecast with confidence levels 80 and 90
forecastfitSM1c <- forecast(fitSM1, h=20, level=c(80,90))
## Plot sub-meter 1 forecast, limit y and add labels
plot(forecastfitSM1c, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Kitchen Forecast")
plotForecastErrors <- function(forecasterrors)
{
# make a histogram of the forecast errors:
mybinsize <- IQR(forecasterrors)/4
mysd <- sd(forecasterrors)
mymin <- min(forecasterrors) - mysd*5
mymax <- max(forecasterrors) + mysd*3
# generate normally distributed data with mean 0 and standard deviation mysd
mynorm <- rnorm(10000, mean=0, sd=mysd)
mymin2 <- min(mynorm)
mymax2 <- max(mynorm)
if (mymin2 < mymin) { mymin <- mymin2 }
if (mymax2 > mymax) { mymax <- mymax2 }
# make a red histogram of the forecast errors, with the normally distributed data overlaid:
mybins <- seq(mymin, mymax, mybinsize)
hist(forecasterrors, col="red", freq=FALSE, breaks=mybins)
# freq=FALSE ensures the area under the histogram = 1
# generate normally distributed data with mean 0 and standard deviation mysd
myhist <- hist(mynorm, plot=FALSE, breaks=mybins)
# plot the normal curve as a blue line on top of the histogram of forecast errors:
print(points(myhist$mids, myhist$density, type="l", col="blue", lwd=2))
}
All three models perform similarly.
rsme_list <- c(9.568, 7.403, 8.122)
rsquared_list <- c(0.3375, 0.3006, 0.3544)
model_comparison <- data.frame(rsme_list, rsquared_list)
colnames(model_comparison) <- c("RSME", "RSquared")
rownames(model_comparison) <- c("Kitchen Prediction", "Laundry Room Forecast Prediction", "Water Heater & AC Prediction")
model_comparison
Also it seems model’s forecast errors do not follow a normal distribution, with center in 0, which indicates the models could be improved further more.
plotForecastErrors(forecastfitSM1c$residuals) # make a histogram
plotForecastErrors(forecastfitSM2c$residuals) # make a histogram
plotForecastErrors(forecastfitSM3c$residuals) # make a histogram
## Decompose Sub-meter 1 into trend, seasonal and remainder
components070809SM1weekly <- decompose(tsSM1_070809weekly)
## Plot decomposed sub-meter 1
autoplot(components070809SM1weekly, main="Kitchen decomposition of additive time series")
## Check summary statistics for decomposed sub-meter 1
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM1weekly$seasonal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.59190 -2.10632 -1.84190 -0.01167 -1.71209 17.62925
print("Trend")
## [1] "Trend"
summary(components070809SM1weekly$trend)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.365 1.808 2.558 2.342 2.904 3.538 52
print("Random")
## [1] "Random"
summary(components070809SM1weekly$random)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -20.5331 -0.8937 -0.3312 -0.3120 0.2361 24.2650 52
## Decompose Sub-meter 2 into trend, seasonal and remainder
components070809SM2weekly <- decompose(tsSM2_070809weekly)
## Plot decomposed sub-meter 2
autoplot(components070809SM2weekly, main="Laundry Room decomposition of additive time series")
## Check summary statistics for decomposed sub-meter 2
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM2weekly$seasonal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.102163 -2.025240 -1.525240 -0.009684 -1.015625 16.902644
print("Trend")
## [1] "Trend"
summary(components070809SM2weekly$trend)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.692 1.817 1.789 2.019 2.933 52
print("Random")
## [1] "Random"
summary(components070809SM2weekly$random)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -18.5950 -0.3353 0.1166 0.1166 0.5685 18.8281 52
## Decompose Sub-meter 3 into trend, seasonal and remainder
components070809SM3weekly <- decompose(tsSM3_070809weekly)
## Plot decomposed sub-meter 3
autoplot(components070809SM3weekly, main = "Water Heater & AC Forecast decomposition of additive time series")
## Check summary statistics for decomposed sub-meter 3
print("Seasonal")
## [1] "Seasonal"
summary(components070809SM3weekly$seasonal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -6.37999 -6.01942 2.37481 0.07089 3.12481 11.97578
print("Trend")
## [1] "Trend"
summary(components070809SM3weekly$trend)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.288 5.769 6.106 6.046 6.308 6.981 52
print("Random")
## [1] "Random"
summary(components070809SM3weekly$random)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -13.0191 -0.4998 0.1252 0.1348 0.7454 13.2887 52
## Seasonal adjusting sub-meter 3 by subtracting the seasonal component & plot
tsSM3_070809Adjusted <- tsSM3_070809weekly - components070809SM3weekly$seasonal
autoplot(tsSM3_070809Adjusted, main = "Water Heater & AC Forecast Holt-Winters Forecasting")
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM3_070809Adjusted), main = "Adjusted Water Heater & AC decomposition of additive time series")
“Yes there is a seasonal line, but look at the scale for the seasonal section. -1e-15 through 5e-16. That’s a decimal with 15 zeros before 1. A very very small number indeed. For all practical purposes the seasonality has been removed.”
## Holt Winters Exponential Smoothing & Plot
tsSM3_HW070809 <- HoltWinters(tsSM3_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM3_HW070809, ylim = c(0, 25), main="Water Heater & AC Holt-Winters filtering")
Perform a new forecast on the data w/o seasonal data:
## HoltWinters forecast & plot
tsSM3_HW070809for <- forecast(tsSM3_HW070809, h=25)
plot(tsSM3_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time - Water Heather and AC", main="Water Heater & AC Forecasts from Holt-Winters")
## Forecast HoltWinters with diminished confidence levels
tsSM3_HW070809forC <- forecast(tsSM3_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM3_HW070809forC, ylim = c(0, 9), ylab= "Watt-Hours", xlab="Time - Water Heather and AC", start(2010), main = "Water Heater & AC Holt-Winters forecasts")
### TODO: how is this plot different from the forecasted plot from step three of the plan of attack? Which, if any, is more useful?
## Seasonal adjusting sub-meter 2 by subtracting the seasonal component & plot
tsSM2_070809Adjusted <- tsSM2_070809weekly - components070809SM2weekly$seasonal
autoplot(tsSM2_070809Adjusted, main = "Laundry Room Holt-Winters Forecasting")
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM2_070809Adjusted), main = "Adjusted Laundry Room decomposition of additive time series")
## Holt Winters Exponential Smoothing & Plot
tsSM2_HW070809 <- HoltWinters(tsSM2_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM2_HW070809, ylim = c(0, 25), main = "Laundry Room Holt-Winters filtering")
Perform a new forecast on the data w/o seasonal data:
## HoltWinters forecast & plot
tsSM2_HW070809for <- forecast(tsSM2_HW070809, h=25)
plot(tsSM2_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Laundry Room forecasts from Holt-Winters")
## Forecast HoltWinters with diminished confidence levels
tsSM2_HW070809forC <- forecast(tsSM2_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM2_HW070809forC, ylim = c(0, 5), ylab= "Watt-Hours", xlab="Time", start(2010), main = "Laundry Room Holt-Winters forecasts")
## Seasonal adjusting sub-meter 1 by subtracting the seasonal component & plot
tsSM1_070809Adjusted <- tsSM1_070809weekly - components070809SM1weekly$seasonal
autoplot(tsSM1_070809Adjusted, main = "Kitchen Holt-Winters Forecasting")
## Test Seasonal Adjustment by running Decompose again. Note the very, very small scale for Seasonal
autoplot(decompose(tsSM1_070809Adjusted), main = "Adjusted Kitchen decomposition of additive time series ")
## Holt Winters Exponential Smoothing & Plot
tsSM1_HW070809 <- HoltWinters(tsSM1_070809Adjusted, beta=FALSE, gamma=FALSE)
plot(tsSM1_HW070809, ylim = c(0, 25), main = "Kitchen Holt-Winters filtering")
Perform a new forecast on the data w/o seasonal data:
## HoltWinters forecast & plot
tsSM1_HW070809for <- forecast(tsSM1_HW070809, h=25)
plot(tsSM1_HW070809for, ylim = c(0, 20), ylab= "Watt-Hours", xlab="Time", main = "Kitchen Holt-Winters Forecasts")
## Forecast HoltWinters with diminished confidence levels
tsSM1_HW070809forC <- forecast(tsSM1_HW070809, h=25, level=c(10,25))
## Plot only the forecasted area
plot(tsSM1_HW070809forC, ylim = c(0, 5), ylab= "Watt-Hours", xlab="Time", start(2010), main = "Kitchen Holt-Winters Forecasts")